import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.cluster.vq import kmeans
from scipy.cluster.vq import vq
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
# raw-download link of the workbook; only the four columns used below are read
raw_file = r'https://github.com/ThucDao/OutlierDetection/blob/main/data/baby_log.xls?raw=true'
df = pd.read_excel(
    raw_file,
    sheet_name='Baby Activities',
    usecols=['Activity', 'Type', 'Start', 'Finish'],
)
# preview the first rows
df.head(5)
| Activity | Type | Start | Finish | |
|---|---|---|---|---|
| 0 | Diapers | Poo | 2022-08-17, 21:25 | 2022-08-17, 21:25 |
| 1 | Feeding | Left Breast | 2022-08-17, 20:18 | 2022-08-17, 20:35 |
| 2 | Feeding | Right Breast | 2022-08-17, 16:27 | 2022-08-17, 16:48 |
| 3 | Feeding | Left Breast | 2022-08-17, 12:22 | 2022-08-17, 12:42 |
| 4 | Feeding | Right Breast | 2022-08-17, 10:44 | 2022-08-17, 11:00 |
# check the format of columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 420 entries, 0 to 419 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Activity 420 non-null object 1 Type 420 non-null object 2 Start 420 non-null object 3 Finish 420 non-null object dtypes: object(4) memory usage: 13.2+ KB
# restrict the log to feeding events and renumber the remaining rows from 0
is_feeding = df['Activity'].eq('Feeding')
df = df.loc[is_feeding].reset_index(drop=True)
# confirm the remaining row count and column dtypes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 196 entries, 0 to 195 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Activity 196 non-null object 1 Type 196 non-null object 2 Start 196 non-null object 3 Finish 196 non-null object dtypes: object(4) memory usage: 6.2+ KB
# parse the 'Start' and 'Finish' text columns into datetime values
for timestamp_column in ('Start', 'Finish'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])
# confirm that both columns now have a datetime dtype
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 196 entries, 0 to 195 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Activity 196 non-null object 1 Type 196 non-null object 2 Start 196 non-null datetime64[ns] 3 Finish 196 non-null datetime64[ns] dtypes: datetime64[ns](2), object(2) memory usage: 6.2+ KB
df.head(5)
| Activity | Type | Start | Finish | |
|---|---|---|---|---|
| 0 | Feeding | Left Breast | 2022-08-17 20:18:00 | 2022-08-17 20:35:00 |
| 1 | Feeding | Right Breast | 2022-08-17 16:27:00 | 2022-08-17 16:48:00 |
| 2 | Feeding | Left Breast | 2022-08-17 12:22:00 | 2022-08-17 12:42:00 |
| 3 | Feeding | Right Breast | 2022-08-17 10:44:00 | 2022-08-17 11:00:00 |
| 4 | Feeding | Right Breast | 2022-08-17 06:26:00 | 2022-08-17 06:42:00 |
df.tail(5)
| Activity | Type | Start | Finish | |
|---|---|---|---|---|
| 191 | Feeding | Left Breast | 2022-07-21 14:00:00 | 2022-07-21 14:15:00 |
| 192 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 |
| 193 | Feeding | Right Breast | 2022-07-21 03:28:00 | 2022-07-21 03:34:00 |
| 194 | Feeding | Right Breast | 2022-07-21 03:18:00 | 2022-07-21 03:26:00 |
| 195 | Feeding | Right Breast | 2022-07-21 02:59:00 | 2022-07-21 03:10:00 |
# the log is stored newest-first; flip it into chronological order and renumber
df = df.iloc[::-1].reset_index(drop=True)
df.head(5)
| Activity | Type | Start | Finish | |
|---|---|---|---|---|
| 0 | Feeding | Right Breast | 2022-07-21 02:59:00 | 2022-07-21 03:10:00 |
| 1 | Feeding | Right Breast | 2022-07-21 03:18:00 | 2022-07-21 03:26:00 |
| 2 | Feeding | Right Breast | 2022-07-21 03:28:00 | 2022-07-21 03:34:00 |
| 3 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 |
| 4 | Feeding | Left Breast | 2022-07-21 14:00:00 | 2022-07-21 14:15:00 |
df.tail(5)
| Activity | Type | Start | Finish | |
|---|---|---|---|---|
| 191 | Feeding | Right Breast | 2022-08-17 06:26:00 | 2022-08-17 06:42:00 |
| 192 | Feeding | Right Breast | 2022-08-17 10:44:00 | 2022-08-17 11:00:00 |
| 193 | Feeding | Left Breast | 2022-08-17 12:22:00 | 2022-08-17 12:42:00 |
| 194 | Feeding | Right Breast | 2022-08-17 16:27:00 | 2022-08-17 16:48:00 |
| 195 | Feeding | Left Breast | 2022-08-17 20:18:00 | 2022-08-17 20:35:00 |
# duration of each feeding: finish timestamp minus start timestamp
df['Feeding_time'] = df['Finish'].sub(df['Start'])
df.head(5)
| Activity | Type | Start | Finish | Feeding_time | |
|---|---|---|---|---|---|
| 0 | Feeding | Right Breast | 2022-07-21 02:59:00 | 2022-07-21 03:10:00 | 0 days 00:11:00 |
| 1 | Feeding | Right Breast | 2022-07-21 03:18:00 | 2022-07-21 03:26:00 | 0 days 00:08:00 |
| 2 | Feeding | Right Breast | 2022-07-21 03:28:00 | 2022-07-21 03:34:00 | 0 days 00:06:00 |
| 3 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 | 0 days 00:15:00 |
| 4 | Feeding | Left Breast | 2022-07-21 14:00:00 | 2022-07-21 14:15:00 | 0 days 00:15:00 |
# gap until the next feeding: start of the following row minus finish of this row
# (the last row has no following row, so its value is missing)
next_start = df['Start'].shift(periods=-1)
df['Feeding_interval'] = next_start - df['Finish']
df.head(5)
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | |
|---|---|---|---|---|---|---|
| 0 | Feeding | Right Breast | 2022-07-21 02:59:00 | 2022-07-21 03:10:00 | 0 days 00:11:00 | 0 days 00:08:00 |
| 1 | Feeding | Right Breast | 2022-07-21 03:18:00 | 2022-07-21 03:26:00 | 0 days 00:08:00 | 0 days 00:02:00 |
| 2 | Feeding | Right Breast | 2022-07-21 03:28:00 | 2022-07-21 03:34:00 | 0 days 00:06:00 | 0 days 00:39:00 |
| 3 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 | 0 days 00:15:00 | 0 days 09:32:00 |
| 4 | Feeding | Left Breast | 2022-07-21 14:00:00 | 2022-07-21 14:15:00 | 0 days 00:15:00 | 0 days 02:16:00 |
df.tail(5)
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | |
|---|---|---|---|---|---|---|
| 191 | Feeding | Right Breast | 2022-08-17 06:26:00 | 2022-08-17 06:42:00 | 0 days 00:16:00 | 0 days 04:02:00 |
| 192 | Feeding | Right Breast | 2022-08-17 10:44:00 | 2022-08-17 11:00:00 | 0 days 00:16:00 | 0 days 01:22:00 |
| 193 | Feeding | Left Breast | 2022-08-17 12:22:00 | 2022-08-17 12:42:00 | 0 days 00:20:00 | 0 days 03:45:00 |
| 194 | Feeding | Right Breast | 2022-08-17 16:27:00 | 2022-08-17 16:48:00 | 0 days 00:21:00 | 0 days 03:30:00 |
| 195 | Feeding | Left Breast | 2022-08-17 20:18:00 | 2022-08-17 20:35:00 | 0 days 00:17:00 | NaT |
# the final row has no following feeding, so its interval is missing; discard it
last_label = df.index[-1]
df = df.drop(index=last_label)
df.tail(5)
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | |
|---|---|---|---|---|---|---|
| 190 | Feeding | Left Breast | 2022-08-17 04:18:00 | 2022-08-17 04:34:00 | 0 days 00:16:00 | 0 days 01:52:00 |
| 191 | Feeding | Right Breast | 2022-08-17 06:26:00 | 2022-08-17 06:42:00 | 0 days 00:16:00 | 0 days 04:02:00 |
| 192 | Feeding | Right Breast | 2022-08-17 10:44:00 | 2022-08-17 11:00:00 | 0 days 00:16:00 | 0 days 01:22:00 |
| 193 | Feeding | Left Breast | 2022-08-17 12:22:00 | 2022-08-17 12:42:00 | 0 days 00:20:00 | 0 days 03:45:00 |
| 194 | Feeding | Right Breast | 2022-08-17 16:27:00 | 2022-08-17 16:48:00 | 0 days 00:21:00 | 0 days 03:30:00 |
# turn both timedelta columns into whole numbers of minutes
for duration_column in ('Feeding_time', 'Feeding_interval'):
    minutes = df[duration_column].dt.total_seconds().div(60)
    # astype(int) drops any fractional part of a minute
    df[duration_column] = minutes.astype(int)
df.head(5)
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | |
|---|---|---|---|---|---|---|
| 0 | Feeding | Right Breast | 2022-07-21 02:59:00 | 2022-07-21 03:10:00 | 11 | 8 |
| 1 | Feeding | Right Breast | 2022-07-21 03:18:00 | 2022-07-21 03:26:00 | 8 | 2 |
| 2 | Feeding | Right Breast | 2022-07-21 03:28:00 | 2022-07-21 03:34:00 | 6 | 39 |
| 3 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 | 15 | 572 |
| 4 | Feeding | Left Breast | 2022-07-21 14:00:00 | 2022-07-21 14:15:00 | 15 | 136 |
# check again the format of columns
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 195 entries, 0 to 194 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Activity 195 non-null object 1 Type 195 non-null object 2 Start 195 non-null datetime64[ns] 3 Finish 195 non-null datetime64[ns] 4 Feeding_time 195 non-null int32 5 Feeding_interval 195 non-null int32 dtypes: datetime64[ns](2), int32(2), object(2) memory usage: 9.1+ KB
# histogram (with a KDE curve) of how long each feeding lasts
feeding_minutes = df['Feeding_time']
sns.displot(feeding_minutes, kde=True)
plt.xlabel('Minutes of each feeding', fontsize = 12)
plt.ylabel('Frequency', fontsize = 12)
plt.title('Feeding time distribution', fontsize = 15)
plt.show()
# shape statistics of the same column
print(f"Skewness: {feeding_minutes.skew():.2f}")
print(f"Kurtosis: {feeding_minutes.kurt():.2f}")
Skewness: 0.55 Kurtosis: 1.19
# histogram (with a KDE curve) of the gap between consecutive feedings
interval_minutes = df['Feeding_interval']
sns.displot(interval_minutes, kde=True)
plt.xlabel('Minutes to the next feeding', fontsize = 12)
plt.ylabel('Frequency', fontsize = 12)
plt.title('Feeding interval distribution', fontsize = 15)
plt.show()
# shape statistics of the same column
print(f"Skewness: {interval_minutes.skew():.2f}")
print(f"Kurtosis: {interval_minutes.kurt():.2f}")
Skewness: 1.15 Kurtosis: 1.27
For skewness: a value greater than +1 or lower than –1 indicates a substantially skewed distribution.
For kurtosis (pandas reports excess kurtosis, for which a normal distribution scores 0): a value greater than +1 indicates that the distribution is too peaked; a value less than –1 indicates that it is too flat.
# for each measure draw one overall box plot, then one split by feeding type
# (column, x-axis label, title of the overall plot, title of the grouped plot)
boxplot_specs = (
    ('Feeding_time', 'Minutes of each feeding',
     'Feeding time box plot', 'Box plot of feeding time grouped by type'),
    ('Feeding_interval', 'Minutes to the next feeding',
     'Feeding interval box plot', 'Box plot of feeding interval grouped by type'),
)
for column, x_label, overall_title, grouped_title in boxplot_specs:
    # overall distribution of the measure
    ax = sns.boxplot(x=df[column])
    ax.set_xlabel(x_label, fontsize = 12)
    ax.set_title(overall_title, fontsize = 15, pad = 20)
    plt.show()
    # same measure, one box per feeding type
    ax = sns.boxplot(x=column, y='Type', data=df)
    ax.set_xlabel(x_label, fontsize = 12)
    ax.set_ylabel('Feeding type', fontsize = 12)
    ax.set_title(grouped_title, fontsize = 15, pad = 20)
    plt.show()
# create a function to calculate IQR lower and upper bounds
def IQR_bounds(dataframe, column_name, multiple=1.5):
    """Return the lower and upper interquartile-range fences of a column.

    The fences are ``Q1 - multiple * IQR`` and ``Q3 + multiple * IQR``, where
    Q1 and Q3 are the 0.25 and 0.75 quantiles of the column and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding the column to analyse.
    column_name : str
        Name of the numeric column.
    multiple : float, default 1.5
        How many IQRs beyond the quartiles the fences are placed.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.
    """
    column = dataframe[column_name]
    # determine the lower and upper quartiles
    lower_quartile = column.quantile(0.25)
    upper_quartile = column.quantile(0.75)
    # the interquartile range is the spread of the middle half of the data
    IQR = upper_quartile - lower_quartile
    lower_bound = lower_quartile - multiple * IQR
    upper_bound = upper_quartile + multiple * IQR
    return lower_bound, upper_bound
# feeding times on or beyond the 1.5 * IQR fences (both comparisons are inclusive)
lower_bound, upper_bound = IQR_bounds(df, 'Feeding_time', 1.5)
feed_time_values = df['Feeding_time']
at_or_below_lower = feed_time_values.le(lower_bound)
at_or_above_upper = feed_time_values.ge(upper_bound)
IQR_outliers_feedtime = df[at_or_below_lower | at_or_above_upper]
IQR_outliers_feedtime
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | |
|---|---|---|---|---|---|---|
| 6 | Feeding | Left Breast | 2022-07-21 22:14:00 | 2022-07-21 22:44:00 | 30 | 201 |
| 9 | Feeding | Left Breast | 2022-07-22 07:05:00 | 2022-07-22 07:35:00 | 30 | 300 |
| 61 | Feeding | Left Breast | 2022-07-27 23:15:00 | 2022-07-27 23:46:00 | 31 | 25 |
| 108 | Feeding | Left Breast | 2022-08-02 15:40:00 | 2022-08-02 16:06:00 | 26 | 180 |
| 138 | Feeding | Left Breast | 2022-08-07 00:07:00 | 2022-08-07 00:33:00 | 26 | 71 |
# number of outliers of feeding time
len(IQR_outliers_feedtime)
5
# feeding intervals on or beyond the 1.5 * IQR fences (both comparisons are inclusive)
lower_bound, upper_bound = IQR_bounds(df, 'Feeding_interval', 1.5)
interval_values = df['Feeding_interval']
at_or_below_lower = interval_values.le(lower_bound)
at_or_above_upper = interval_values.ge(upper_bound)
IQR_outliers_feedinterval = df[at_or_below_lower | at_or_above_upper]
IQR_outliers_feedinterval
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | |
|---|---|---|---|---|---|---|
| 3 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 | 15 | 572 |
| 120 | Feeding | Bottle | 2022-08-03 23:04:00 | 2022-08-03 23:13:00 | 9 | 533 |
| 128 | Feeding | Right Breast | 2022-08-05 09:57:00 | 2022-08-05 10:12:00 | 15 | 691 |
| 152 | Feeding | Right Breast | 2022-08-09 02:03:00 | 2022-08-09 02:23:00 | 20 | 661 |
| 163 | Feeding | Right Breast | 2022-08-11 10:14:00 | 2022-08-11 10:33:00 | 19 | 525 |
| 167 | Feeding | Left Breast | 2022-08-12 07:15:00 | 2022-08-12 07:32:00 | 17 | 623 |
| 169 | Feeding | Right Breast | 2022-08-12 20:16:00 | 2022-08-12 20:27:00 | 11 | 527 |
| 181 | Feeding | Left Breast | 2022-08-14 20:25:00 | 2022-08-14 20:35:00 | 10 | 687 |
# number of outliers of feeding interval
len(IQR_outliers_feedinterval)
8
Z-score is also called standard score. Z-score tells how many standard deviations away a data point is from the mean.
If the z-score of a data point is more than 3 or less than -3, it indicates that the data point can be an outlier.
# standardize feeding time: distance from the mean in units of standard deviation
feed_time = df['Feeding_time']
df['zscore_feedtime'] = (feed_time - feed_time.mean()) / feed_time.std()
# plot the distribution of the standardized values
sns.displot(df['zscore_feedtime'], kde=True)
plt.xlabel('z-score', fontsize = 12)
plt.ylabel('Frequency', fontsize = 12)
plt.title('Standard feeding time distribution', fontsize = 15)
plt.show()
# rows whose z-score has an absolute value of 3 or more are the outliers
zscore_outliers_feedtime = df[df['zscore_feedtime'].abs() >= 3]
zscore_outliers_feedtime
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | zscore_feedtime | |
|---|---|---|---|---|---|---|---|
| 6 | Feeding | Left Breast | 2022-07-21 22:14:00 | 2022-07-21 22:44:00 | 30 | 201 | 3.392534 |
| 9 | Feeding | Left Breast | 2022-07-22 07:05:00 | 2022-07-22 07:35:00 | 30 | 300 | 3.392534 |
| 61 | Feeding | Left Breast | 2022-07-27 23:15:00 | 2022-07-27 23:46:00 | 31 | 25 | 3.608302 |
# number of outliers of feeding time
len(zscore_outliers_feedtime)
3
# standardize feeding interval: distance from the mean in units of standard deviation
feed_interval = df['Feeding_interval']
df['zscore_feedinterval'] = (feed_interval - feed_interval.mean()) / feed_interval.std()
# plot the distribution of the standardized values
sns.displot(df['zscore_feedinterval'], kde=True)
plt.xlabel('z-score', fontsize = 12)
plt.ylabel('Frequency', fontsize = 12)
plt.title('Standard feeding interval distribution', fontsize = 15)
plt.show()
# rows whose z-score has an absolute value of 3 or more are the outliers
zscore_outliers_feedinterval = df[df['zscore_feedinterval'].abs() >= 3]
zscore_outliers_feedinterval
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | zscore_feedtime | zscore_feedinterval | |
|---|---|---|---|---|---|---|---|---|
| 128 | Feeding | Right Breast | 2022-08-05 09:57:00 | 2022-08-05 10:12:00 | 15 | 691 | 0.156017 | 3.436103 |
| 152 | Feeding | Right Breast | 2022-08-09 02:03:00 | 2022-08-09 02:23:00 | 20 | 661 | 1.234856 | 3.230169 |
| 181 | Feeding | Left Breast | 2022-08-14 20:25:00 | 2022-08-14 20:35:00 | 10 | 687 | -0.922822 | 3.408645 |
# number of outliers of feeding interval
len(zscore_outliers_feedinterval)
3
# make a scatter plot of 2 variables: feeding time and feeding interval
fig, ax = plt.subplots(1, 1, figsize=(14, 7))
ax.scatter(df['Feeding_time'], df['Feeding_interval'])
ax.set_xlabel("Feeding time", fontsize = 15, labelpad = 15)
ax.set_ylabel("Feeding interval", fontsize = 15, labelpad = 15)
ax.set_title("Scatter plot of feeding time and feeding interval", fontsize = 15, pad = 20)
ax.tick_params(axis = "both", labelsize = 12)
# display the figure explicitly, like every other matplotlib plot in this file
plt.show()
# create a function to make an interactive scatter plot
def scatter_plot(dataframe, x_col_name, y_col_name, color_col_name, title, hover_name):
    """Build and display an interactive Plotly scatter plot.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to plot.
    x_col_name, y_col_name : str
        Columns mapped to the x and y axes.
    color_col_name : str
        Column that determines the colour of each point.
    title : str
        Figure title.
    hover_name : str
        Column passed to Plotly Express as ``hover_name``.

    Returns
    -------
    None
        The figure is shown, not returned.
    """
    fig = px.scatter(
        dataframe,
        x=x_col_name,
        y=y_col_name,
        color=color_col_name,
        hover_name=hover_name,
    )
    # title_x = 0.5 places the title at the horizontal centre of the figure
    fig.update_layout(title=title, title_x=0.5)
    fig.show()
Density-Based Spatial Clustering of Applications with Noise (DBSCAN) is a non-parametric, density-based clustering algorithm.
Mechanism: it randomly selects a point that is not already assigned to a cluster or designated as an outlier, then determines whether it is a core point by checking whether at least a given minimum number of samples exist within a given distance (epsilon). If so, it is designated as a core point, and all points within direct reach of it join its cluster. This process is repeated until the edge of the cluster is reached, where there are no more points within the epsilon distance of the cluster.
If a point does not fall within any of the potential clusters then it is deemed an outlier.
The benefit of this method is that it is unsupervised and can be used when the distribution of values in the feature space cannot be assumed.
# create the method instance
# eps : maximum distance between two samples for them to be considered as in the same neighborhood
# min_samples : the number of samples in a neighborhood for a point to be considered as a core point (this includes the point itself).
# n_jobs : the number of parallel jobs to run. None means 1. -1 means using all processors.
# NOTE(review): the two features are clustered unscaled, so the Euclidean distance is
# presumably dominated by 'Feeding_interval' — confirm that this is intended.
dbs = DBSCAN(eps = 30, min_samples = 10, metric = "euclidean", n_jobs = -1)
# cluster the data; DBSCAN labels noise points -1 and cluster members 0, 1, ...
dbscan_labels = dbs.fit_predict(df[["Feeding_time", "Feeding_interval"]])
# collapse the labels to the strings "1" (belongs to a cluster) and "-1" (outlier)
df["outlier_DBSCAN"] = ["1" if label > -1 else "-1" for label in dbscan_labels]
# create a scatter plot of outliers
scatter_plot(df, "Feeding_time", "Feeding_interval", "outlier_DBSCAN",
             "DBSCAN Outlier Detection", "Start")
# get outliers (columns are selected by name so that column order does not matter)
df.loc[df["outlier_DBSCAN"] == "-1",
       ["Activity", "Type", "Start", "Finish", "Feeding_time", "Feeding_interval",
        "outlier_DBSCAN"]]
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | outlier_DBSCAN | |
|---|---|---|---|---|---|---|---|
| 3 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 | 15 | 572 | -1 |
| 47 | Feeding | Bottle | 2022-07-25 23:40:00 | 2022-07-25 23:53:00 | 13 | 513 | -1 |
| 120 | Feeding | Bottle | 2022-08-03 23:04:00 | 2022-08-03 23:13:00 | 9 | 533 | -1 |
| 128 | Feeding | Right Breast | 2022-08-05 09:57:00 | 2022-08-05 10:12:00 | 15 | 691 | -1 |
| 146 | Feeding | Right Breast | 2022-08-07 21:27:00 | 2022-08-07 21:46:00 | 19 | 462 | -1 |
| 150 | Feeding | Right Breast | 2022-08-08 15:33:00 | 2022-08-08 15:53:00 | 20 | 435 | -1 |
| 152 | Feeding | Right Breast | 2022-08-09 02:03:00 | 2022-08-09 02:23:00 | 20 | 661 | -1 |
| 163 | Feeding | Right Breast | 2022-08-11 10:14:00 | 2022-08-11 10:33:00 | 19 | 525 | -1 |
| 167 | Feeding | Left Breast | 2022-08-12 07:15:00 | 2022-08-12 07:32:00 | 17 | 623 | -1 |
| 169 | Feeding | Right Breast | 2022-08-12 20:16:00 | 2022-08-12 20:27:00 | 11 | 527 | -1 |
| 173 | Feeding | Left Breast | 2022-08-13 15:43:00 | 2022-08-13 16:00:00 | 17 | 431 | -1 |
| 181 | Feeding | Left Breast | 2022-08-14 20:25:00 | 2022-08-14 20:35:00 | 10 | 687 | -1 |
| 184 | Feeding | Left Breast | 2022-08-15 19:23:00 | 2022-08-15 19:38:00 | 15 | 482 | -1 |
| 188 | Feeding | Right Breast | 2022-08-16 11:47:00 | 2022-08-16 12:08:00 | 21 | 463 | -1 |
| 189 | Feeding | Left Breast | 2022-08-16 19:51:00 | 2022-08-16 20:04:00 | 13 | 494 | -1 |
# number of outliers
print(len(df[df['outlier_DBSCAN'] == "-1"]))
15
The Local Outlier Factor algorithm works similarly to DBSCAN algorithm in that it examines neighbors of a point but behaves a bit differently.
The Local Outlier Factor algorithm examines a point and its neighbors to find its density and compare with the density of neighbors. If the density of a point is much smaller than that of its neighbors, it is suggested that this point is an outlier.
The key points of this algorithm are the number of neighbors to be compared with and the metric to calculate the density.
Default number of neighbors: 20 (this number should be greater if the proportion of outliers is more than 10%).
Default metric: Minkowski distance, which generalises both Euclidean distance and Manhattan distance.
The benefit of this algorithm is that it can take both the local and global properties of the dataset into account as it focuses on how isolated the sample is in respect to the surrounding neighbourhood.
# create the method instance
lof = LocalOutlierFactor(n_neighbors = 20)
# extract outliers from the data: fit_predict returns -1 for outliers and 1 for inliers
df["outlier_local_outlier_factor"] = lof.fit_predict(df[["Feeding_time", "Feeding_interval"]])
# store the flag as a string so that it is plotted as a category
df["outlier_local_outlier_factor"] = df["outlier_local_outlier_factor"].astype(str)
# extract the scores of strength of outliers (the more negative, the more abnormal)
df["score_local_outlier_factor"] = lof.negative_outlier_factor_
# create a scatter plot of strength of outliers
scatter_plot(df, "Feeding_time", "Feeding_interval", "score_local_outlier_factor",
             "Local Outlier Factor Outlier Detection Scores", "Start")
# create a scatter plot of outliers
scatter_plot(df, "Feeding_time", "Feeding_interval", "outlier_local_outlier_factor",
             "Local Outlier Factor Outlier Detection", "Start")
# get outliers (columns are selected by name so that column order does not matter)
df.loc[df["outlier_local_outlier_factor"] == "-1",
       ["Activity", "Type", "Start", "Finish", "Feeding_time", "Feeding_interval",
        "outlier_local_outlier_factor", "score_local_outlier_factor"]]
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | outlier_local_outlier_factor | score_local_outlier_factor | |
|---|---|---|---|---|---|---|---|---|
| 3 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 | 15 | 572 | -1 | -1.573256 |
| 128 | Feeding | Right Breast | 2022-08-05 09:57:00 | 2022-08-05 10:12:00 | 15 | 691 | -1 | -2.174212 |
| 152 | Feeding | Right Breast | 2022-08-09 02:03:00 | 2022-08-09 02:23:00 | 20 | 661 | -1 | -1.998145 |
| 167 | Feeding | Left Breast | 2022-08-12 07:15:00 | 2022-08-12 07:32:00 | 17 | 623 | -1 | -1.800302 |
| 181 | Feeding | Left Breast | 2022-08-14 20:25:00 | 2022-08-14 20:35:00 | 10 | 687 | -1 | -2.147838 |
# number of outliers
print(len(df[df['outlier_local_outlier_factor'] == "-1"]))
5
Unlike other algorithms, which first model the normal data and then identify anomalies, Isolation Forest focuses initially on identifying the anomalies and only then on the normal data.
For this algorithm, we first need to specify the contamination parameter, which is the proportion of the data expected to be anomalies.
Based on the results of the other algorithms (at most 15 out of 195 points, about 7.7%, were flagged as outliers), we set the contamination parameter to 0.07 (7%).
# create the method instance
# contamination : the proportion of the data expected to be outliers
# random_state : fixed seed so that the forest is the same on every run
iso_for = IsolationForest(n_estimators = 100, random_state = 34, contamination = 0.07)
# extract outliers from the data: fit_predict returns -1 for outliers and 1 for inliers
df["outlier_isolation_forest"] = iso_for.fit_predict(df[["Feeding_time", "Feeding_interval"]])
# store the flag as a string so that it is plotted as a category
df["outlier_isolation_forest"] = df["outlier_isolation_forest"].astype(str)
# extract the scores of strength of outliers (negative scores correspond to outliers)
df["score_isolation_forest"] = iso_for.decision_function(df[["Feeding_time", "Feeding_interval"]])
# create a scatter plot of strength of outliers
scatter_plot(df, "Feeding_time", "Feeding_interval", "score_isolation_forest",
             "Isolation Forest Outlier Detection Scores", "Start")
# create a scatter plot of outliers
scatter_plot(df, "Feeding_time", "Feeding_interval", "outlier_isolation_forest",
             "Isolation Forest Outlier Detection", "Start")
# get outliers (columns are selected by name so that column order does not matter)
df.loc[df["outlier_isolation_forest"] == "-1",
       ["Activity", "Type", "Start", "Finish", "Feeding_time", "Feeding_interval",
        "outlier_isolation_forest", "score_isolation_forest"]]
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | outlier_isolation_forest | score_isolation_forest | |
|---|---|---|---|---|---|---|---|---|
| 6 | Feeding | Left Breast | 2022-07-21 22:14:00 | 2022-07-21 22:44:00 | 30 | 201 | -1 | -0.074472 |
| 9 | Feeding | Left Breast | 2022-07-22 07:05:00 | 2022-07-22 07:35:00 | 30 | 300 | -1 | -0.114167 |
| 14 | Feeding | Bottle | 2022-07-22 20:50:00 | 2022-07-22 20:55:00 | 5 | 25 | -1 | -0.006361 |
| 49 | Feeding | Bottle | 2022-07-26 12:17:00 | 2022-07-26 12:23:00 | 6 | 339 | -1 | -0.000430 |
| 61 | Feeding | Left Breast | 2022-07-27 23:15:00 | 2022-07-27 23:46:00 | 31 | 25 | -1 | -0.150260 |
| 108 | Feeding | Left Breast | 2022-08-02 15:40:00 | 2022-08-02 16:06:00 | 26 | 180 | -1 | -0.036311 |
| 114 | Feeding | Bottle | 2022-08-03 01:57:00 | 2022-08-03 02:02:00 | 5 | 414 | -1 | -0.062942 |
| 120 | Feeding | Bottle | 2022-08-03 23:04:00 | 2022-08-03 23:13:00 | 9 | 533 | -1 | -0.023518 |
| 126 | Feeding | Right Breast | 2022-08-04 23:57:00 | 2022-08-05 00:20:00 | 23 | 399 | -1 | -0.029719 |
| 128 | Feeding | Right Breast | 2022-08-05 09:57:00 | 2022-08-05 10:12:00 | 15 | 691 | -1 | -0.080554 |
| 138 | Feeding | Left Breast | 2022-08-07 00:07:00 | 2022-08-07 00:33:00 | 26 | 71 | -1 | -0.062691 |
| 152 | Feeding | Right Breast | 2022-08-09 02:03:00 | 2022-08-09 02:23:00 | 20 | 661 | -1 | -0.083744 |
| 167 | Feeding | Left Breast | 2022-08-12 07:15:00 | 2022-08-12 07:32:00 | 17 | 623 | -1 | -0.036936 |
| 181 | Feeding | Left Breast | 2022-08-14 20:25:00 | 2022-08-14 20:35:00 | 10 | 687 | -1 | -0.099950 |
# number of outliers
print(len(df[df['outlier_isolation_forest'] == "-1"]))
14
The Elliptic Envelope algorithm assumes that the data follow a Gaussian distribution. It tries to fit an imaginary ellipse around the dataset; values inside that ellipse are taken to be normal data and anything outside of it is assumed to be an outlier.
For this algorithm, we first need to specify the contamination parameter, which is the proportion of the data expected to be anomalies.
Based on the results of the other algorithms (at most 15 out of 195 points, about 7.7%, were flagged as outliers), we set the contamination parameter to 0.07 (7%).
# create the method instance
# contamination : the proportion of the data expected to be outliers
# NOTE(review): no random_state is passed here, so the fit may not be exactly
# reproducible between runs — confirm whether a fixed seed is wanted.
ell_env = EllipticEnvelope(contamination = 0.07)
# extract outliers from the data: fit_predict returns -1 for outliers and 1 for inliers
df["outlier_elliptic_envelope"] = ell_env.fit_predict(df[["Feeding_time", "Feeding_interval"]])
# store the flag as a string so that it is plotted as a category
df["outlier_elliptic_envelope"] = df["outlier_elliptic_envelope"].astype(str)
# extract the scores of strength of outliers (the more negative, the more abnormal)
df["score_elliptic_envelope"] = ell_env.score_samples(df[["Feeding_time", "Feeding_interval"]])
# create a scatter plot of strength of outliers
scatter_plot(df, "Feeding_time", "Feeding_interval", "score_elliptic_envelope",
             "Elliptic Envelope Outlier Detection Scores", "Start")
# create a scatter plot of outliers
scatter_plot(df, "Feeding_time", "Feeding_interval", "outlier_elliptic_envelope",
             "Elliptic Envelope Outlier Detection", "Start")
# get outliers (columns are selected by name so that column order does not matter)
df.loc[df["outlier_elliptic_envelope"] == "-1",
       ["Activity", "Type", "Start", "Finish", "Feeding_time", "Feeding_interval",
        "outlier_elliptic_envelope", "score_elliptic_envelope"]]
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | outlier_elliptic_envelope | score_elliptic_envelope | |
|---|---|---|---|---|---|---|---|---|
| 3 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 | 15 | 572 | -1 | -15.340802 |
| 6 | Feeding | Left Breast | 2022-07-21 22:14:00 | 2022-07-21 22:44:00 | 30 | 201 | -1 | -18.530930 |
| 9 | Feeding | Left Breast | 2022-07-22 07:05:00 | 2022-07-22 07:35:00 | 30 | 300 | -1 | -19.943052 |
| 61 | Feeding | Left Breast | 2022-07-27 23:15:00 | 2022-07-27 23:46:00 | 31 | 25 | -1 | -22.776111 |
| 114 | Feeding | Bottle | 2022-08-03 01:57:00 | 2022-08-03 02:02:00 | 5 | 414 | -1 | -11.532893 |
| 120 | Feeding | Bottle | 2022-08-03 23:04:00 | 2022-08-03 23:13:00 | 9 | 533 | -1 | -14.378320 |
| 128 | Feeding | Right Breast | 2022-08-05 09:57:00 | 2022-08-05 10:12:00 | 15 | 691 | -1 | -25.471249 |
| 138 | Feeding | Left Breast | 2022-08-07 00:07:00 | 2022-08-07 00:33:00 | 26 | 71 | -1 | -11.334525 |
| 152 | Feeding | Right Breast | 2022-08-09 02:03:00 | 2022-08-09 02:23:00 | 20 | 661 | -1 | -24.975548 |
| 163 | Feeding | Right Breast | 2022-08-11 10:14:00 | 2022-08-11 10:33:00 | 19 | 525 | -1 | -13.670159 |
| 167 | Feeding | Left Breast | 2022-08-12 07:15:00 | 2022-08-12 07:32:00 | 17 | 623 | -1 | -19.874392 |
| 169 | Feeding | Right Breast | 2022-08-12 20:16:00 | 2022-08-12 20:27:00 | 11 | 527 | -1 | -12.811697 |
| 181 | Feeding | Left Breast | 2022-08-14 20:25:00 | 2022-08-14 20:35:00 | 10 | 687 | -1 | -26.327654 |
| 188 | Feeding | Right Breast | 2022-08-16 11:47:00 | 2022-08-16 12:08:00 | 21 | 463 | -1 | -11.639295 |
# number of outliers
print(len(df[df['outlier_elliptic_envelope'] == "-1"]))
14
As all four methods above show different outliers, we should use a combination of all results to decide the outliers.
# every method stored its verdict as the string "1" (normal) or "-1" (outlier);
# convert the verdicts back to integers and add them up row by row
method_flag_columns = (
    'outlier_DBSCAN',
    'outlier_local_outlier_factor',
    'outlier_isolation_forest',
    'outlier_elliptic_envelope',
)
df['outlier_sum'] = sum(df[flag_column].astype(int) for flag_column in method_flag_columns)
# how many rows received each total
print(df["outlier_sum"].value_counts())
4 171 2 10 0 8 -4 4 -2 2 Name: outlier_sum, dtype: int64
Explanation (each method contributes +1 when it marks a point as normal and -1 when it marks it as an outlier):
4 = 4 - 0 : normal in all 4 methods, outlier in none
2 = 3 - 1 : normal in 3 methods, outlier in 1 method
0 = 2 - 2 : normal in 2 methods, outlier in 2 methods
-2 = 1 - 3 : normal in 1 method, outlier in 3 methods
-4 = 0 - 4 : normal in no method, outlier in all 4 methods
The decision is made by majority: a point marked as an outlier by 3 or more methods (score <= -2) is a final outlier.
# extract final outliers: a total of -2 or lower means at least 3 of the 4 methods
# flagged the row, which is recorded as "-1"; every other row is recorded as "1"
df["outlier_final"] = ["1" if total > -2 else "-1" for total in df["outlier_sum"]]
# create a scatter plot of strength of final outliers
scatter_plot(df, "Feeding_time", "Feeding_interval", "outlier_sum",
             "Ensemble Outlier Detection Scores", "Start")
# create a scatter plot of final outliers
scatter_plot(df, "Feeding_time", "Feeding_interval", "outlier_final",
             "Ensemble Outlier Detection", "Start")
# get final outliers (columns are selected by name so that column order does not matter)
df.loc[df["outlier_final"] == "-1",
       ["Activity", "Type", "Start", "Finish", "Feeding_time", "Feeding_interval",
        "outlier_sum", "outlier_final"]]
| Activity | Type | Start | Finish | Feeding_time | Feeding_interval | outlier_sum | outlier_final | |
|---|---|---|---|---|---|---|---|---|
| 3 | Feeding | Right Breast | 2022-07-21 04:13:00 | 2022-07-21 04:28:00 | 15 | 572 | -2 | -1 |
| 120 | Feeding | Bottle | 2022-08-03 23:04:00 | 2022-08-03 23:13:00 | 9 | 533 | -2 | -1 |
| 128 | Feeding | Right Breast | 2022-08-05 09:57:00 | 2022-08-05 10:12:00 | 15 | 691 | -4 | -1 |
| 152 | Feeding | Right Breast | 2022-08-09 02:03:00 | 2022-08-09 02:23:00 | 20 | 661 | -4 | -1 |
| 167 | Feeding | Left Breast | 2022-08-12 07:15:00 | 2022-08-12 07:32:00 | 17 | 623 | -4 | -1 |
| 181 | Feeding | Left Breast | 2022-08-14 20:25:00 | 2022-08-14 20:35:00 | 10 | 687 | -4 | -1 |
# number of final outliers
print(len(df[df['outlier_final'] == "-1"]))
6